library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(leaflet)
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(readxl)
# Default figure geometry applied to every knitr chunk in the document.
knitr::opts_chunk$set(fig.width = 6, fig.asp = 0.6, out.width = "90%")

# Minimal ggplot2 theme with the legend placed underneath the plot.
theme_set(theme_minimal() + theme(legend.position = "bottom"))

# Use viridis for continuous colour/fill scales ...
options(
  ggplot2.continuous.colour = "viridis",
  ggplot2.continuous.fill = "viridis"
)

# ... and override the default discrete scales with their viridis counterparts.
scale_colour_discrete <- scale_color_viridis_d
scale_fill_discrete <- scale_fill_viridis_d
Read in wine data.
# Load the tidied wine review data set (column types are guessed by readr).
wine_df <- read_csv("./wine_data/tidy/wine_all.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Warning: Duplicated column names deduplicated: 'X1' => 'X1_1' [2]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## X1_1 = col_double(),
## country = col_character(),
## description = col_character(),
## designation = col_character(),
## points = col_double(),
## price = col_double(),
## province = col_character(),
## region_1 = col_character(),
## taster_name = col_character(),
## title = col_character(),
## variety = col_character(),
## winery = col_character(),
## year = col_double(),
## type = col_character()
## )
### remove region 2, taster twitter and missing values in region 1.
#wine_type <- read_csv("./wine_data/winemag-data-130k-v2.csv") %>%
# group_by(variety) %>%
# count() %>%
# arrange(desc(n)) %>%
# as.tibble()
Make a plot of distribution of price/rating by type
# Violin plot of price per wine type, restricted to wines costing at most 200
# (rows with a missing type or price are dropped).
wine_df %>%
  drop_na(type) %>%
  filter(price <= 200) %>%
  ggplot(aes(type, price, colour = type)) +
  geom_violin()
# Violin plot of rating per wine type; `points` is renamed so the axis reads
# "rating".
wine_df %>%
  rename(rating = points) %>%
  drop_na(type) %>%
  ggplot(aes(type, rating, colour = type)) +
  geom_violin()
Make plots of the mean price/rating by country.
# Interactive bar chart of the mean wine price per country, with countries
# ordered by their (unrounded) mean price.
#
# Rows with a missing price or a missing country are removed up front. A missing
# country would otherwise form its own NA group that plotly cannot place on the
# axis (presumably the source of the "Ignoring 1 observations" warning).
y <- list(
  title = "Mean Price"
)
wine_df %>%
  filter(!is.na(price), !is.na(country)) %>%
  group_by(country) %>%
  # `.groups = "drop"` makes the ungrouping explicit and silences the message.
  summarise(mean = mean(price), .groups = "drop") %>%
  mutate(
    # Reorder before rounding so ties created by rounding do not affect order.
    country = fct_reorder(country, mean),
    mean = round(mean, 2),
    text_label = str_c("Country:", country, "\nmean price:", mean)
  ) %>%
  plot_ly(
    x = ~country, y = ~mean, color = ~country, text = ~text_label,
    type = "bar", colors = "viridis"
  ) %>%
  layout(yaxis = y)
## `summarise()` ungrouping output (override with `.groups` argument)
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning: Ignoring 1 observations
# Interactive bar chart of the mean wine rating per country, with countries
# ordered by their (unrounded) mean rating.
#
# Rows with a missing rating or a missing country are removed up front. A
# missing country would otherwise form its own NA group that plotly cannot place
# on the axis (presumably the source of the "Ignoring 1 observations" warning).
y <- list(
  title = "Mean rating"
)
wine_df %>%
  filter(!is.na(points), !is.na(country)) %>%
  group_by(country) %>%
  # `.groups = "drop"` makes the ungrouping explicit and silences the message.
  summarise(mean = mean(points), .groups = "drop") %>%
  mutate(
    # Reorder before rounding so ties created by rounding do not affect order.
    country = fct_reorder(country, mean),
    mean = round(mean, 2),
    text_label = str_c("Country:", country, "\nmean rating:", mean)
  ) %>%
  plot_ly(
    x = ~country, y = ~mean, color = ~country, text = ~text_label,
    type = "bar", colors = "viridis"
  ) %>%
  layout(yaxis = y)
## `summarise()` ungrouping output (override with `.groups` argument)
## Warning: Ignoring 1 observations
Even though Switzerland has the highest mean wine price, its mean rating ranks only 11th.
Make plots of ratings/price by year by wine type.
# Line chart of the mean rating (rounded to 2 decimals) per vintage year and
# wine type, for vintages after 1900.
wine_df %>%
  drop_na(points, type) %>%
  filter(year > 1900) %>%
  group_by(year, type) %>%
  summarise(mean = round(mean(points), 2)) %>%
  ggplot(aes(year, mean, colour = type)) +
  geom_line() +
  labs(
    title = "change in mean ratings by wine type and year",
    x = "Year",
    y = "Mean rating"
  )
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
# Point-and-line chart of the mean price (rounded to 2 decimals) per vintage
# year and wine type, for vintages after 1900.
wine_df %>%
  drop_na(price, type) %>%
  filter(year > 1900) %>%
  group_by(year, type) %>%
  summarise(mean = round(mean(price), 2)) %>%
  ggplot(aes(year, mean, colour = type)) +
  geom_point() +
  geom_line() +
  labs(
    title = "change in mean price by wine type in 1900-2017",
    x = "Year",
    y = "Mean price"
  )
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
# Point-and-line chart of the mean price (rounded to 2 decimals) per vintage
# year and wine type, for vintages after 2000.
wine_df %>%
  drop_na(price, type) %>%
  filter(year > 2000) %>%
  group_by(year, type) %>%
  summarise(mean = round(mean(price), 2)) %>%
  ggplot(aes(year, mean, colour = type)) +
  geom_point() +
  geom_line() +
  labs(
    title = "change in mean price by wine type in 21 century",
    x = "Year",
    y = "Mean price"
  )
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
Make plots of ratings by taster.
# Interactive bar chart of the mean rating given by each taster, with tasters
# ordered by their (unrounded) mean rating.
#
# Rows with a missing rating or a missing taster name are removed up front. A
# missing taster would otherwise form its own NA group that plotly cannot place
# on the axis (presumably the source of the "Ignoring 1 observations" warning).
y <- list(
  title = "Mean ratings"
)
wine_df %>%
  filter(!is.na(points), !is.na(taster_name)) %>%
  group_by(taster_name) %>%
  # `.groups = "drop"` makes the ungrouping explicit and silences the message.
  summarise(mean = mean(points), .groups = "drop") %>%
  mutate(
    # Reorder before rounding so ties created by rounding do not affect order.
    taster_name = fct_reorder(taster_name, mean),
    mean = round(mean, 2),
    text_label = str_c("Taster:", taster_name, "\nmean rating:", mean)
  ) %>%
  plot_ly(
    x = ~taster_name, y = ~mean, color = ~taster_name, text = ~text_label,
    type = "bar", colors = "viridis"
  ) %>%
  layout(yaxis = y)
## `summarise()` ungrouping output (override with `.groups` argument)
## Warning: Ignoring 1 observations
Who rates the highest-priced wines ($800 or more)?
# Number of wines priced at 800 or more reviewed by each named taster.
# The result stays grouped by taster, as `count()` on grouped data would leave it.
wine_df %>%
  drop_na(taster_name) %>%
  filter(price >= 800) %>%
  group_by(taster_name) %>%
  summarise(n = n(), .groups = "keep")
## # A tibble: 5 x 2
## # Groups: taster_name [5]
## taster_name n
## <chr> <int>
## 1 Anne Krebiehl MW 1
## 2 Joe Czerwinski 3
## 3 Kerin O’Keefe 1
## 4 Matt Kettmann 1
## 5 Roger Voss 24
Most of the most expensive wines are rated by Roger Voss.
# Table of the number of reviews per named taster, most prolific first.
wine_df %>%
  drop_na(taster_name) %>%
  count(taster_name, sort = TRUE) %>%
  knitr::kable()
| taster_name | n |
|---|---|
| Roger Voss | 25514 |
| Michael Schachner | 15134 |
| Kerin O’Keefe | 10776 |
| Virginie Boone | 9537 |
| Paul Gregutt | 9532 |
| Matt Kettmann | 6332 |
| Joe Czerwinski | 5147 |
| Sean P. Sullivan | 4966 |
| Anna Lee C. Iijima | 4415 |
| Jim Gordon | 4177 |
| Anne Krebiehl MW | 3685 |
| Lauren Buzzeo | 1835 |
| Susan Kostrzewa | 1085 |
| Mike DeSimone | 514 |
| Jeff Jenssen | 491 |
| Alexander Peartree | 415 |
| Carrie Dykes | 139 |
| Fiona Adams | 27 |
| Christina Pickard | 6 |
Roger Voss rates the largest number of wines at Wine Enthusiast Magazine.
Get the wineries with the highest average wine ratings.
# Ten wineries (per winery/country pair) with the highest mean rating.
#
# `summarise()` on data grouped by two variables leaves the result grouped by
# the first one (`winery`). Taking the top 10 on that grouped result selects the
# top 10 *within each winery*, i.e. it returns every row. The groups are
# therefore dropped before selecting the overall top 10.
wine_df %>%
  filter(!is.na(winery)) %>%
  group_by(winery, country) %>%
  summarise(mean = mean(points), .groups = "drop") %>%
  # Sorted by descending mean; ties at the cut-off are kept, as `top_n()` did.
  slice_max(mean, n = 10)
## `summarise()` regrouping output by 'winery' (override with `.groups` argument)
## Selecting by mean
## # A tibble: 16,961 x 3
## # Groups: winery [16,757]
## winery country mean
## <chr> <chr> <dbl>
## 1 Araujo US 98
## 2 Gandona US 97
## 3 J.L. Chave France 97
## 4 Ovid US 97
## 5 Standish Australia 97
## 6 Salon France 96.8
## 7 Tenuta dell'Ornellaia Italy 96.7
## 8 Château Pétrus France 96.7
## 9 Barons de Rothschild France 96
## 10 Bryant Family US 96
## # … with 16,951 more rows